import pandas as pd
import folium
import math
from folium import plugins
# Load the ALA occurrence records (comma-separated) and the two GRIIS
# Australia tables (tab-separated text files).
ala_occurrences = pd.read_csv(r"data/ALA_EcoCommons_Data_Analyst_dataset.csv")
griis_australia_taxon = pd.read_csv(
    r"data/dwca-griis-australia-v1.6/taxon.txt",
    delimiter="\t",
)
griis_australia_speciesprofile = pd.read_csv(
    r"data/dwca-griis-australia-v1.6/speciesprofile.txt",
    delimiter="\t",
)
# Scientific name recorded against every ALA occurrence.
all_species = ala_occurrences.loc[:, "scientificName"]
The first step is to collect the data from the GBIF Backbone Taxonomy. I define two functions to assist with this.
import requests
from datetime import datetime
from time import sleep
import os.path
def match_names( name_usages, temp_file = "match_results_temp.csv", load_previous = False ):
    """Match scientific names against the GBIF ``species/match`` endpoint.

    One request is sent per name. Each JSON response is flattened into a
    single-row, string-typed DataFrame indexed by ``nameToMatch``. Progress
    is written to ``temp_file`` every 10 requests and once more at the end,
    so an interrupted run can be resumed with ``load_previous=True``.

    Parameters
    ----------
    name_usages : pandas.Series
        The names to look up.
    temp_file : str
        Path of the CSV checkpoint file that is written (and, when
        ``load_previous`` is set, read first).
    load_previous : bool
        When true, load ``temp_file``, skip every name already present in
        its index, and add the new responses to the loaded rows.

    Returns
    -------
    pandas.DataFrame or None
        All match results (previously saved rows first, then new ones).
        An empty DataFrame when there was nothing to process and nothing to
        load. ``None`` when ``load_previous`` is set but ``temp_file`` does
        not exist.

    Raises
    ------
    requests.exceptions.RequestException
        When a single request has failed five times in a row. Whatever was
        collected so far is saved to ``temp_file`` before re-raising.
    """
    # local import: keeps the function self-contained without touching the
    # file-level import list
    from urllib.parse import quote

    max_attempts = 5
    retry_wait_seconds = 60
    request_timeout_seconds = 60

    if load_previous:
        if not os.path.isfile(temp_file):
            print("(match_names) No such file: " + temp_file + "\nAborting.")
            return None
        previous_results = pd.read_csv( temp_file, index_col = "nameToMatch", dtype=str )
        # only process names which haven't yet been saved
        scientificNamesToProcess = name_usages[~name_usages.isin(previous_results.index)]
        # start from the saved rows so that new responses extend them
        # instead of replacing them
        frames = [previous_results]
    else:
        # process all names
        scientificNamesToProcess = name_usages
        frames = []

    def _collect():
        """Collapse everything gathered so far into one DataFrame."""
        if not frames:
            return pd.DataFrame(index=pd.Index([], name="nameToMatch"))
        combined = pd.concat(frames) if len(frames) > 1 else frames[0]
        # keep only the combined frame so later calls don't re-concatenate
        frames[:] = [combined]
        return combined

    # start timer
    count = 0
    startTime = datetime.now()

    # iterate through each name and send a request to the GBIF API
    for nameToMatch in scientificNamesToProcess:
        # Percent-encode the whole name: author strings contain characters
        # such as '&', '#' or '+' that would otherwise corrupt the query.
        # The name is still wrapped in double quotes (%22) as before.
        url = (
            'https://api.gbif.org/v1/species/match?name=%22'
            + quote(str(nameToMatch), safe="")
            + '%22'
        )

        # if the connection fails, retry again in one minute
        # abort after five failed attempts
        response = None
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, timeout=request_timeout_seconds)
                break
            except requests.exceptions.RequestException:
                print( "(match_names) Connection error at " + str(datetime.now()) + " (attempt " + str(attempt) + "); ", end='')
                if attempt == max_attempts:
                    print( "Too many attempts. Aborting." )
                    # keep what we have so the run can be resumed later
                    if frames:
                        _collect().to_csv( temp_file )
                    raise
                print( "retrying in 1min..." )
                sleep(retry_wait_seconds)

        # parse response from server
        match_results = pd.json_normalize(response.json())
        match_results["nameToMatch"] = nameToMatch
        match_results.set_index( "nameToMatch", inplace = True )
        # store everything as strings so that column dtypes stay stable no
        # matter which fields an individual response contains
        frames.append( match_results.astype(str) )

        count = count + 1
        print( "(match_names) name " + str(count) + ", '" + str(nameToMatch) + "', is done")

        # save results every 10 requests
        if count % 10 == 0:
            print("(match_names) saving results to " + temp_file)
            _collect().to_csv( temp_file )
            # sleep to ensure that the file doesn't get overwritten while it's still marked as busy
            sleep(1)

    print( "(match_names) Time taken:" )
    print(datetime.now() - startTime)

    # save final file so it can be used if required
    results = _collect()
    results.to_csv( temp_file )
    return results
# find a unique list of all species names used across all datasets
# (Series.append was removed in pandas 2.0, so the two columns are combined
# with pd.concat; missing names are dropped because they cannot be matched)
all_nameusages = pd.concat(
    [ala_occurrences["scientificName"], griis_australia_taxon["scientificName"]]
).dropna().drop_duplicates()
# Match every name usage against GBIF. The acceptedUsageKey column of the
# result is filled only for some names; the accepted name behind that key is
# fetched with a separate request by another function later in the code.
# I use my name here to indicate that this is data I have generated
stewart_matching_results = match_names( all_nameusages, load_previous=False )
def get_species_data( taxon_ids, temp_file = "accepted_name_temp.csv", load_previous = False ):
    """Fetch the GBIF species record for each taxon ID.

    One request is sent per ID to ``https://api.gbif.org/v1/species/<id>``.
    Each JSON response is flattened into a single-row, string-typed
    DataFrame indexed by ``taxon_id_to_query``. Progress is written to
    ``temp_file`` every 10 requests and once more at the end, so an
    interrupted run can be resumed with ``load_previous=True``.

    Parameters
    ----------
    taxon_ids : pandas.Series
        The taxon IDs to look up.
    temp_file : str
        Path of the CSV checkpoint file that is written (and, when
        ``load_previous`` is set, read first).
    load_previous : bool
        When true, load ``temp_file``, skip every ID already present in its
        index, and add the new responses to the loaded rows.

    Returns
    -------
    pandas.DataFrame or None
        All species records (previously saved rows first, then new ones).
        An empty DataFrame when there was nothing to process and nothing to
        load. ``None`` when ``load_previous`` is set but ``temp_file`` does
        not exist.

    Raises
    ------
    requests.exceptions.RequestException
        When a single request has failed five times in a row. Whatever was
        collected so far is saved to ``temp_file`` before re-raising.
    """
    max_attempts = 5
    retry_wait_seconds = 60
    request_timeout_seconds = 60

    if load_previous:
        if not os.path.isfile(temp_file):
            print("(get_species_data) No such file: " + temp_file + "\nAborting.")
            return None
        # the checkpoint is written with the index named "taxon_id_to_query",
        # so it has to be read back under that same name
        previous_results = pd.read_csv( temp_file, index_col = "taxon_id_to_query", dtype=str )
        # only process IDs which haven't yet been saved; compare as strings
        # so the check works whether the IDs are numbers or text
        already_done = taxon_ids.astype(str).isin(previous_results.index.astype(str))
        taxon_ids_to_process = taxon_ids[~already_done]
        # start from the saved rows so that new responses extend them
        # instead of replacing them
        frames = [previous_results]
    else:
        # process all IDs
        taxon_ids_to_process = taxon_ids
        frames = []

    def _collect():
        """Collapse everything gathered so far into one DataFrame."""
        if not frames:
            return pd.DataFrame(index=pd.Index([], name="taxon_id_to_query"))
        combined = pd.concat(frames) if len(frames) > 1 else frames[0]
        # keep only the combined frame so later calls don't re-concatenate
        frames[:] = [combined]
        return combined

    # start timer
    count = 0
    startTime = datetime.now()
    print( "(get_species_data) Getting data for " + str(len(taxon_ids_to_process)) + " records" )

    # iterate through each taxon ID and send a request to the GBIF API
    for taxonID in taxon_ids_to_process:
        url = 'https://api.gbif.org/v1/species/' + str(taxonID)

        # if the connection fails, retry again in one minute
        # abort after five failed attempts
        response = None
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, timeout=request_timeout_seconds)
                break
            except requests.exceptions.RequestException:
                print( "(get_species_data) Connection error at " + str(datetime.now()) + " (attempt " + str(attempt) + "); ", end='')
                if attempt == max_attempts:
                    print( "Too many attempts. Aborting." )
                    # keep what we have so the run can be resumed later
                    if frames:
                        _collect().to_csv( temp_file )
                    raise
                print( "retrying in 1min..." )
                sleep(retry_wait_seconds)

        # parse response from server
        response_dataframe = pd.json_normalize(response.json())
        response_dataframe["taxon_id_to_query"] = taxonID
        response_dataframe.set_index( "taxon_id_to_query", inplace = True )
        # store everything as strings so that column dtypes stay stable no
        # matter which fields an individual response contains
        frames.append( response_dataframe.astype(str) )

        count = count + 1
        print( "(get_species_data) Taxon ID " + str(count) + ", '" + str(taxonID) + "', is done")

        # save results every 10 requests
        if count % 10 == 0:
            print("(get_species_data) saving results to " + temp_file)
            _collect().to_csv( temp_file )
            # sleep to ensure that the file doesn't get overwritten while it's still marked as busy
            sleep(1)

    print( "(get_species_data) Time taken:" )
    print(datetime.now() - startTime)

    # save final file so it can be used if required
    results = _collect()
    results.to_csv( temp_file )
    return results
# keep only the accepted-usage keys that are actually filled in, i.e. the
# names which the GBIF API identified as synonyms
accepted_names_to_find = stewart_matching_results["acceptedUsageKey"].dropna()
accepted_names_to_find
nameToMatch
Parasuta dwyeri 7799658
Abelia ×grandiflora (Rovelli ex André) Rehder 2888249
Abutilon pictum (Gillies ex Hook.) Walp. 7900084
Acacia erioloba E.Mey. 3974776
Acacia sieberiana DC. 7564113
...
Verbena quadrangularis Vell. 2925518
Vigna caracalla (L.) Verdc. 8308653
Wisteria sinensis Sweet 8149049
Zinnia violacea Cav. 3111754
Zoobotryon verticillatum (Delle Chiaje, 1822) 7446481
Name: acceptedUsageKey, Length: 215, dtype: object
# find the accepted name for each of those taxon IDs
accepted_names = get_species_data( accepted_names_to_find, load_previous=False )
accepted_names
# take the accepted name, and place it on to the matching results
# The same accepted key can be requested more than once (several synonyms
# may share one accepted usage), so keep a single row per key; otherwise the
# left merge below would multiply the matching rows.
shortened_accepted_names = (
    accepted_names[["key", "canonicalName"]]
    .rename(columns={"canonicalName": "pAcceptedCanonicalName"})
    .drop_duplicates(subset="key")
)
stewart_matching_results["nameToMatch"] = stewart_matching_results.index
stewart_matching_results = stewart_matching_results.merge(
    shortened_accepted_names,
    left_on="acceptedUsageKey",
    right_on="key",
    how="left",
)
stewart_matching_results
| usageKey | scientificName | canonicalName | rank | status | confidence | matchType | kingdom | phylum | order | ... | familyKey | genusKey | speciesKey | synonym | class | acceptedUsageKey | note | nameToMatch | key | pAcceptedCanonicalName | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2462672 | Eulamprus tympanum (Lönnberg & Andersson, 1915) | Eulamprus tympanum | SPECIES | ACCEPTED | 98 | EXACT | Animalia | Chordata | Squamata | ... | 9115 | 2462629 | 2462672 | False | Reptilia | NaN | NaN | Eulamprus tympanum | NaN | NaN |
| 1 | 5789593 | Liopholis whitii (Lacépède, 1804) | Liopholis whitii | SPECIES | ACCEPTED | 98 | EXACT | Animalia | Chordata | Squamata | ... | 9115 | 2462698 | 5789593 | False | Reptilia | NaN | NaN | Liopholis whitii | NaN | NaN |
| 2 | 2463636 | Hemiergis Wagler, 1830 | Hemiergis | GENUS | ACCEPTED | 94 | EXACT | Animalia | Chordata | Squamata | ... | 9115 | 2463636 | NaN | False | Reptilia | NaN | NaN | Hemiergis | NaN | NaN |
| 3 | 2462156 | Pseudemoia entrecasteauxii Duméril & Bibron, 1839 | Pseudemoia entrecasteauxii | SPECIES | ACCEPTED | 98 | EXACT | Animalia | Chordata | Squamata | ... | 9115 | 2462149 | 2462156 | False | Reptilia | NaN | NaN | Pseudemoia entrecasteauxii | NaN | NaN |
| 4 | 2464014 | Acritoscincus duperreyi (Gray, 1838) | Acritoscincus duperreyi | SPECIES | ACCEPTED | 98 | EXACT | Animalia | Chordata | Squamata | ... | 9115 | 2463863 | 2464014 | False | Reptilia | NaN | NaN | Acritoscincus duperreyi | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3081 | 3039423 | Ziziphus jujuba Mill. | Ziziphus jujuba | SPECIES | ACCEPTED | 100 | EXACT | Plantae | Tracheophyta | Rosales | ... | 2407 | 3039411 | 3039423 | False | Magnoliopsida | NaN | NaN | Ziziphus jujuba Mill. | NaN | NaN |
| 3082 | 3039424 | Ziziphus mauritiana Lam. | Ziziphus mauritiana | SPECIES | ACCEPTED | 100 | EXACT | Plantae | Tracheophyta | Rosales | ... | 2407 | 3039411 | 3039424 | False | Magnoliopsida | NaN | NaN | Ziziphus mauritiana Lam. | NaN | NaN |
| 3083 | 3875567 | Ziziphus mucronata Willd. | Ziziphus mucronata | SPECIES | ACCEPTED | 100 | EXACT | Plantae | Tracheophyta | Rosales | ... | 2407 | 3039411 | 3875567 | False | Magnoliopsida | NaN | NaN | Ziziphus mucronata Willd. | NaN | NaN |
| 3084 | 8228089 | Ziziphus spina-christi (L.) Desf. | Ziziphus spina-christi | SPECIES | ACCEPTED | 100 | EXACT | Plantae | Tracheophyta | Rosales | ... | 2407 | 3039411 | 8228089 | False | Magnoliopsida | NaN | NaN | Ziziphus spina-christi (L.) Desf. | NaN | NaN |
| 3085 | 1010205 | Zoobotryon verticillatum (Delle Chiaje, 1822) | Zoobotryon verticillatum | SPECIES | SYNONYM | 99 | EXACT | Animalia | Bryozoa | Ctenostomatida | ... | 6904 | 1010211 | 7446481 | True | Gymnolaemata | 7446481 | NaN | Zoobotryon verticillatum (Delle Chiaje, 1822) | 7446481 | Amathia verticillata |
3086 rows × 27 columns
# build the final name column: use the accepted canonical name where one was
# found, and fall back to the matched canonical name everywhere else
stewart_matching_results["acceptedCanonicalName"] = (
    stewart_matching_results["pAcceptedCanonicalName"]
    .fillna(stewart_matching_results["canonicalName"])
)
stewart_matching_results.columns
Index(['usageKey', 'scientificName', 'canonicalName', 'rank', 'status',
'confidence', 'matchType', 'kingdom', 'phylum', 'order', 'family',
'genus', 'species', 'kingdomKey', 'phylumKey', 'classKey', 'orderKey',
'familyKey', 'genusKey', 'speciesKey', 'synonym', 'class',
'acceptedUsageKey', 'note', 'nameToMatch', 'key',
'pAcceptedCanonicalName', 'acceptedCanonicalName'],
dtype='object')
# finalise the 'key' (a mini taxonomic backbone) that links the two datasets:
# one row per name usage together with its accepted canonical name
taxonomic_backbone = (
    stewart_matching_results
    .rename(columns={"nameToMatch": "nameUsage"})
    [["nameUsage", "acceptedCanonicalName"]]
)
taxonomic_backbone.to_csv("taxonomic_backbone.csv")
taxonomic_backbone
| nameUsage | acceptedCanonicalName | |
|---|---|---|
| 0 | Eulamprus tympanum | Eulamprus tympanum |
| 1 | Liopholis whitii | Liopholis whitii |
| 2 | Hemiergis | Hemiergis |
| 3 | Pseudemoia entrecasteauxii | Pseudemoia entrecasteauxii |
| 4 | Acritoscincus duperreyi | Acritoscincus duperreyi |
| ... | ... | ... |
| 3081 | Ziziphus jujuba Mill. | Ziziphus jujuba |
| 3082 | Ziziphus mauritiana Lam. | Ziziphus mauritiana |
| 3083 | Ziziphus mucronata Willd. | Ziziphus mucronata |
| 3084 | Ziziphus spina-christi (L.) Desf. | Ziziphus spina-christi |
| 3085 | Zoobotryon verticillatum (Delle Chiaje, 1822) | Amathia verticillata |
3086 rows × 2 columns
# The accepted GBIF name is the key that links the datasets together, so it
# is attached to each of them in turn.
# note: not every name in the ALA dataset has a match in the GBIF dataset;
# this would need to be followed up
# attach the GBIF canonical name to the ALA occurrences
ala_occurrences = pd.merge(
    ala_occurrences,
    taxonomic_backbone,
    how="left",
    left_on="scientificName",
    right_on="nameUsage",
)
ala_occurrences
| recordID | decimalLatitude | decimalLongitude | eventDate | family | scientificName | taxonConceptID | dataResourceName | basisOfRecord | recordedBy | ABS_SA2_region_2016 | nameUsage | acceptedCanonicalName | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | d30e9204-3d31-49dd-8eaf-14211833fab9 | -35.907400 | 149.046000 | NaN | Scincidae | Eulamprus tympanum | urn:lsid:biodiversity.org.au:afd.taxon:48f412e... | ACT Wildlife Atlas | UNKNOWN | NaN | NAMADGI | Eulamprus tympanum | Eulamprus tympanum |
| 1 | 84691aeb-0185-44c2-9da3-4f75234a8992 | -35.893600 | 149.036000 | 2001-11-01T13:00:00Z | Scincidae | Liopholis whitii | urn:lsid:biodiversity.org.au:afd.taxon:ae2be63... | Museums Victoria provider for OZCAM | MATERIAL_SAMPLE | Monash University | NAMADGI | Liopholis whitii | Liopholis whitii |
| 2 | 875008d8-92db-49ec-9473-12269130308f | -35.893600 | 149.036000 | 2001-11-01T13:00:00Z | Scincidae | Liopholis whitii | urn:lsid:biodiversity.org.au:afd.taxon:ae2be63... | Museums Victoria provider for OZCAM | MATERIAL_SAMPLE | Monash University | NAMADGI | Liopholis whitii | Liopholis whitii |
| 3 | f691c807-cedc-4a0c-96ab-888203546c01 | -35.893600 | 149.036000 | 2001-11-01T13:00:00Z | Scincidae | Liopholis whitii | urn:lsid:biodiversity.org.au:afd.taxon:ae2be63... | Museums Victoria provider for OZCAM | MATERIAL_SAMPLE | Monash University | NAMADGI | Liopholis whitii | Liopholis whitii |
| 4 | 9526022f-b99e-4135-b501-a38089ebeb11 | -35.892810 | 148.976750 | 2018-02-02T13:00:00Z | Scincidae | Hemiergis | urn:lsid:biodiversity.org.au:afd.taxon:96f7532... | Australian National Wildlife Collection provid... | PRESERVED_SPECIMEN | NaN | COOMA REGION | Hemiergis | Hemiergis |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6906 | 904462fa-69b0-4378-b2c5-974df940b387 | -35.116700 | 150.766700 | 1976-12-03T13:00:00Z | Scincidae | Lampropholis delicata | urn:lsid:biodiversity.org.au:afd.taxon:82f18b8... | Australian National Wildlife Collection provid... | PRESERVED_SPECIMEN | NaN | JERVIS BAY | Lampropholis delicata | Lampropholis delicata |
| 6907 | 4c33b868-a834-4aca-b9a9-f87731b0173d | -35.116700 | 150.766700 | 1976-12-04T13:00:00Z | Scincidae | Lampropholis delicata | urn:lsid:biodiversity.org.au:afd.taxon:82f18b8... | Australian National Wildlife Collection provid... | PRESERVED_SPECIMEN | NaN | JERVIS BAY | Lampropholis delicata | Lampropholis delicata |
| 6908 | 5c5496e2-0cc1-46b6-bdd6-566ad14c149a | -35.116700 | 150.766700 | 1976-12-04T13:00:00Z | Scincidae | Ctenotus taeniolatus | urn:lsid:biodiversity.org.au:afd.taxon:4fce537... | Australian National Wildlife Collection provid... | PRESERVED_SPECIMEN | NaN | JERVIS BAY | Ctenotus taeniolatus | Ctenotus taeniolatus |
| 6909 | 2d87fa78-9524-4259-bfcb-55938539e509 | -35.116034 | 150.767127 | 2021-09-09T00:56:00Z | Elapidae | Pseudechis porphyriacus | urn:lsid:biodiversity.org.au:afd.taxon:c1d2d6c... | iNaturalist Australia | HUMAN_OBSERVATION | Maple, Dion | JERVIS BAY | Pseudechis porphyriacus | Pseudechis porphyriacus |
| 6910 | 3db4939c-68b6-4311-8766-c6a351f983c7 | -35.112514 | 150.768177 | 2006-08-09T14:00:00Z | Cheloniidae | Chelonia mydas | urn:lsid:biodiversity.org.au:afd.taxon:2d60547... | NSW BioNet Atlas | HUMAN_OBSERVATION | OPXC06081800 | NaN | Chelonia mydas | Chelonia mydas |
6911 rows × 13 columns
# add GBIF canonical name to GRIIS data
griis_simplified = griis_australia_taxon.merge( taxonomic_backbone, left_on="scientificName", right_on="nameUsage", how="left" )
# every taxon listed in GRIIS is treated as alien
griis_simplified["alien"] = True
# Reduce to a lookup table with one row per accepted name:
#  - rows without an accepted name cannot be linked to anything, and a NaN
#    key would pair up with NaN keys on the other side of a later merge
#  - several GRIIS names can resolve to the same accepted name, and repeated
#    keys would multiply rows in a later merge
# (the former rename of "scientificName" was dropped: that column is not
# part of the selection, so the rename never had any effect)
griis_simplified = (
    griis_simplified[["acceptedCanonicalName", "alien"]]
    .dropna(subset=["acceptedCanonicalName"])
    .drop_duplicates()
    .reset_index(drop=True)
)
griis_simplified
| acceptedCanonicalName | alien | |
|---|---|---|
| 0 | Pyrola | True |
| 1 | Abelmoschus manihot | True |
| 2 | Abutilon grandiflorum | True |
| 3 | Callianthe picta | True |
| 4 | Abutilon theophrasti | True |
| ... | ... | ... |
| 2985 | Ziziphus jujuba | True |
| 2986 | Ziziphus mauritiana | True |
| 2987 | Ziziphus mucronata | True |
| 2988 | Ziziphus spina-christi | True |
| 2989 | Amathia verticillata | True |
2990 rows × 2 columns
# count the occurrences whose accepted name appears in the GRIIS list (True)
# against those whose name does not (False)
ala_occurrences["acceptedCanonicalName"].isin(
    griis_simplified["acceptedCanonicalName"].unique()
).value_counts()
False 6330 True 581 Name: acceptedCanonicalName, dtype: int64
# now we can finally add the GRIIS classification to the ALA data
# The lookup side must hold each accepted name exactly once and no missing
# names: repeated keys duplicate occurrence rows in a left merge, and NaN
# keys pair up with occurrences whose own accepted name is NaN.
# validate="many_to_one" makes pandas raise if the lookup keys are not unique.
ala_occurrences = ala_occurrences.merge(
    griis_simplified
    .dropna(subset=["acceptedCanonicalName"])
    .drop_duplicates(subset=["acceptedCanonicalName"]),
    on="acceptedCanonicalName",
    how="left",
    validate="many_to_one",
)
ala_occurrences
| recordID | decimalLatitude | decimalLongitude | eventDate | family | scientificName | taxonConceptID | dataResourceName | basisOfRecord | recordedBy | ABS_SA2_region_2016 | nameUsage | acceptedCanonicalName | alien | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | d30e9204-3d31-49dd-8eaf-14211833fab9 | -35.907400 | 149.046000 | NaN | Scincidae | Eulamprus tympanum | urn:lsid:biodiversity.org.au:afd.taxon:48f412e... | ACT Wildlife Atlas | UNKNOWN | NaN | NAMADGI | Eulamprus tympanum | Eulamprus tympanum | NaN |
| 1 | 84691aeb-0185-44c2-9da3-4f75234a8992 | -35.893600 | 149.036000 | 2001-11-01T13:00:00Z | Scincidae | Liopholis whitii | urn:lsid:biodiversity.org.au:afd.taxon:ae2be63... | Museums Victoria provider for OZCAM | MATERIAL_SAMPLE | Monash University | NAMADGI | Liopholis whitii | Liopholis whitii | NaN |
| 2 | 875008d8-92db-49ec-9473-12269130308f | -35.893600 | 149.036000 | 2001-11-01T13:00:00Z | Scincidae | Liopholis whitii | urn:lsid:biodiversity.org.au:afd.taxon:ae2be63... | Museums Victoria provider for OZCAM | MATERIAL_SAMPLE | Monash University | NAMADGI | Liopholis whitii | Liopholis whitii | NaN |
| 3 | f691c807-cedc-4a0c-96ab-888203546c01 | -35.893600 | 149.036000 | 2001-11-01T13:00:00Z | Scincidae | Liopholis whitii | urn:lsid:biodiversity.org.au:afd.taxon:ae2be63... | Museums Victoria provider for OZCAM | MATERIAL_SAMPLE | Monash University | NAMADGI | Liopholis whitii | Liopholis whitii | NaN |
| 4 | 9526022f-b99e-4135-b501-a38089ebeb11 | -35.892810 | 148.976750 | 2018-02-02T13:00:00Z | Scincidae | Hemiergis | urn:lsid:biodiversity.org.au:afd.taxon:96f7532... | Australian National Wildlife Collection provid... | PRESERVED_SPECIMEN | NaN | COOMA REGION | Hemiergis | Hemiergis | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9632 | 904462fa-69b0-4378-b2c5-974df940b387 | -35.116700 | 150.766700 | 1976-12-03T13:00:00Z | Scincidae | Lampropholis delicata | urn:lsid:biodiversity.org.au:afd.taxon:82f18b8... | Australian National Wildlife Collection provid... | PRESERVED_SPECIMEN | NaN | JERVIS BAY | Lampropholis delicata | Lampropholis delicata | NaN |
| 9633 | 4c33b868-a834-4aca-b9a9-f87731b0173d | -35.116700 | 150.766700 | 1976-12-04T13:00:00Z | Scincidae | Lampropholis delicata | urn:lsid:biodiversity.org.au:afd.taxon:82f18b8... | Australian National Wildlife Collection provid... | PRESERVED_SPECIMEN | NaN | JERVIS BAY | Lampropholis delicata | Lampropholis delicata | NaN |
| 9634 | 5c5496e2-0cc1-46b6-bdd6-566ad14c149a | -35.116700 | 150.766700 | 1976-12-04T13:00:00Z | Scincidae | Ctenotus taeniolatus | urn:lsid:biodiversity.org.au:afd.taxon:4fce537... | Australian National Wildlife Collection provid... | PRESERVED_SPECIMEN | NaN | JERVIS BAY | Ctenotus taeniolatus | Ctenotus taeniolatus | NaN |
| 9635 | 2d87fa78-9524-4259-bfcb-55938539e509 | -35.116034 | 150.767127 | 2021-09-09T00:56:00Z | Elapidae | Pseudechis porphyriacus | urn:lsid:biodiversity.org.au:afd.taxon:c1d2d6c... | iNaturalist Australia | HUMAN_OBSERVATION | Maple, Dion | JERVIS BAY | Pseudechis porphyriacus | Pseudechis porphyriacus | NaN |
| 9636 | 3db4939c-68b6-4311-8766-c6a351f983c7 | -35.112514 | 150.768177 | 2006-08-09T14:00:00Z | Cheloniidae | Chelonia mydas | urn:lsid:biodiversity.org.au:afd.taxon:2d60547... | NSW BioNet Atlas | HUMAN_OBSERVATION | OPXC06081800 | NaN | Chelonia mydas | Chelonia mydas | NaN |
9637 rows × 14 columns
# initial centre point and zoom level of the map
center = [-35.462633768371354, 148.98941016640208]
map_act = folium.Map(location=center, zoom_start=8)
# all markers go into one cluster layer on the map
marker_cluster = plugins.MarkerCluster().add_to(map_act)

# one marker per occurrence row: red (with an "(alien)" tag in the popup)
# when the row's alien flag is True, green for every other row
for _, record in ala_occurrences.iterrows():
    flagged_alien = record["alien"] == True
    name_suffix = " (alien)" if flagged_alien else ""
    marker_colour = "red" if flagged_alien else "green"
    folium.Marker(
        [record['decimalLatitude'], record['decimalLongitude']],
        popup = f'Scientific name:{record["scientificName"]}{name_suffix} <br />Collector:{record["dataResourceName"]}',
        icon=folium.Icon(color=marker_colour)
    ).add_to(marker_cluster)

# display the map
# species known to be alien are shown with red markers
# NOTE(review): an earlier note said the map only shows the subset of the data
# that could be mapped to the GBIF Taxonomic Backbone; the loop above adds a
# marker for every row of ala_occurrences - confirm which is intended
map_act